import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import hist
import seaborn as sns
from IPython.display import display
import random
import math
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from helper_functions import show_images, do_pca, scree_plot, plot_component
from sklearn.cluster import KMeans
import missingno as msno
import seaborn as sns
sns.set(font_scale = 2)
%matplotlib inline
# Load in the general demographics data.
base_path = './data'
applicant = pd.read_csv(os.path.join(base_path, "applicant.csv"), sep=",")
# Load in the loan data.
loan = pd.read_csv(os.path.join(base_path, "loan.csv"), sep=",")
# Bug fix: two bare `.head()` expressions in a row render at most the last one
# in a notebook (and nothing when run as a script) — display both explicitly.
display(applicant.head())
display(loan.head())
# Summary statistics for the numeric columns of each table.
print("applicant.csv")
display(applicant.describe())
print("loan.csv")
display(loan.describe())
# Column dtypes, to separate object (categorical) from numeric features later.
print("applicant.csv")
display(applicant.dtypes)
print("loan.csv")
display(loan.dtypes)
# Visual overview of missingness (missingno): matrix shows per-row patterns,
# bar shows per-column completeness.
msno.matrix(applicant)
msno.bar(applicant)
msno.matrix(loan)
msno.bar(loan)
# Exact NaN counts per column, most-missing first (displayed explicitly for
# the same reason as the .head() previews above).
applicant_nan = applicant.isna().sum().to_frame('nans').sort_values(by='nans', ascending=False)
display(applicant_nan)
loan_nan = loan.isna().sum().to_frame('nans').sort_values(by='nans', ascending=False)
display(loan_nan)
From the output below, we can see that applicant_id is a good join key, so we can merge the two datasets on that feature.
# Both tables have the same number of rows; verify the applicant_id columns
# line up row-for-row before merging.
print(applicant.shape, loan.shape)
(applicant.applicant_id == loan.applicant_id).sum()
# Inner-join on the shared key (same key name on both sides).
applicant_loan = applicant.merge(loan, on='applicant_id')
applicant_loan.head(10)
# The merge must not drop rows, and the key column must appear only once.
assert(applicant_loan.shape[0] == applicant.shape[0])
assert(applicant_loan.shape[1] == applicant.shape[1] + loan.shape[1] - 1)
# Fraction of missing values per column of the merged frame.
applicant_loan_nan = (applicant_loan.isna().sum()
                      .to_frame('nans')
                      .sort_values(by='nans', ascending=False))
applicant_loan_nan['nans'] = applicant_loan_nan['nans'] / applicant_loan.shape[0]
applicant_loan_nan
# Drop columns that are more than 30% missing, plus the two identifier
# columns, which carry no predictive signal.
drop_columns = list(applicant_loan_nan[applicant_loan_nan['nans'] > 0.3].index)
drop_columns += ['applicant_id', 'loan_application_id']
print(applicant_loan.shape)
applicant_loan_v1 = applicant_loan.drop(columns=drop_columns)
print(drop_columns)
print(applicant_loan.shape)
print(applicant_loan_v1.shape)
# Per-row missing-value counts: how many NaNs does each applicant row carry?
missing_val_row = pd.DataFrame(applicant_loan_v1.isnull().sum(axis=1),columns=['missing_values'])
# 'temp' is a constant 1 so that groupby(...).sum() counts rows per bucket.
missing_val_row['temp'] = 1
missing_val_row.groupby(['missing_values']).sum().sort_values(ascending=False, by = 'temp')
# Histogram of the same distribution.
applicant_loan_v1.isnull().sum(axis=1).hist()
# Share of rows with at least one missing value — the constants 135, 22, 1 are
# read off the counts table above for this particular dataset of 1000 rows.
# NOTE(review): hard-coded; recompute if the data changes.
(135+22+1)/1000
# Map each remaining column to its dtype so we can inspect object (string)
# and integer columns separately.
temp_dtype = applicant_loan_v1.dtypes.to_frame('dtype').reset_index()
temp_dtype[temp_dtype['dtype'] == 'int64']
# For string columns, show cardinality and the actual values.
print("object:")
for col in temp_dtype[temp_dtype['dtype'] == object]['index']:
    print(col, '::::::::::', applicant_loan_v1[col].nunique(), applicant_loan_v1[col].unique())
# For integer columns, only list the values when the column is low-cardinality
# (<= 10 distinct values), i.e. likely categorical rather than continuous.
print("int64:")  # fixed typo: was "in64:"
for col in temp_dtype[temp_dtype['dtype'] == 'int64']['index']:
    if applicant_loan_v1[col].nunique() <= 10:
        print(col, '::::::::::', applicant_loan_v1[col].nunique(), applicant_loan_v1[col].unique())
    else:
        print(col, '::::::::::', applicant_loan_v1[col].nunique())
First we have to sort the features into the categories below (categorical, binary, mixed, and numerical).
This division should be done carefully, since it determines which encoding and scaling each feature receives.
# Nominal features: one-hot encoded later (order preserved — it fixes the
# dummy-column order).
categorical_feature = [
    'Marital_status', 'Housing', 'Employment_status', 'Savings_account_balance',
    'Purpose', 'Property', 'Loan_history', 'Has_been_employed_for_at_least',
    'Has_been_employed_for_at_most', 'Foreign_worker', 'Has_coapplicant',
    'Has_guarantor', 'Gender',
]
# The target flag.
binary_features = ['high_risk_applicant']
# Employment-duration bounds: string-valued but with an ordinal meaning.
mixed_features = ['Has_been_employed_for_at_least', 'Has_been_employed_for_at_most']
# Continuous / count-valued features: standardized later.
numerical_features = [
    'Primary_applicant_age_in_years', 'Number_of_dependents',
    'Years_at_current_residence', 'Months_loan_taken_for',
    'Principal_loan_amount', 'EMI_rate_in_percentage_of_disposable_income',
    'Number_of_existing_loans_at_this_bank',
]
# Fill NaNs in each employment-duration bound from the other bound: the two
# columns are complementary, so where one is missing the other holds the value
# to use. Series.fillna replaces the original Python-level scan that compared
# str(i) == 'nan' element by element.
# NOTE(review): assumes missing entries are genuine NaN (as produced by
# read_csv), not the literal string 'nan' — confirm against the raw data.
applicant_loan_v1[mixed_features[0]] = applicant_loan_v1[mixed_features[0]].fillna(applicant_loan_v1[mixed_features[1]])
applicant_loan_v1[mixed_features[1]] = applicant_loan_v1[mixed_features[1]].fillna(applicant_loan_v1[mixed_features[0]])
# #### Convert the mixed feature to numerical features
#mixed_features = ['Has_been_employed_for_at_least','Has_been_employed_for_at_most']
# Here there are a few points to be noted.
# - I feel that these columns should not be treated as categorical: the number of years carries numerical weight and would aid the model more in classifying the applicant.
# - These two mixed features have NaN values, but there is a pattern to them. 'Has_been_employed_for_at_least' is NaN when 'Has_been_employed_for_at_most' has the value '0 years', and similarly 'Has_been_employed_for_at_most' is NaN when 'Has_been_employed_for_at_least' has the value '7 years'. So there is no harm in eliminating the NaN values by replacing them with the value from the other feature.
# applicant_loan_v1[mixed_features].iloc[:30]
# # Fill nan
# applicant_loan_v1[mixed_features[0]] = [applicant_loan_v1[mixed_features[1]].iloc[n] if str(i) == 'nan' else i for n,i in enumerate(applicant_loan_v1[mixed_features[0]])]
# applicant_loan_v1[mixed_features[1]] = [applicant_loan_v1[mixed_features[0]].iloc[n] if str(i) == 'nan' else i for n,i in enumerate(applicant_loan_v1[mixed_features[1]])]
# # Convert the mixed feature to numerical features
# applicant_loan_v1[mixed_features[0]] = [str(i)[0] for i in applicant_loan_v1[mixed_features[0]]]
# applicant_loan_v1[mixed_features[1]] = [str(i)[0] for i in applicant_loan_v1[mixed_features[1]]]
# applicant_loan_v1[mixed_features]
# One-hot encode every categorical feature; dummy_na=True adds an explicit
# indicator column for missing values instead of silently dropping them.
applicant_loan_v2 = pd.get_dummies(applicant_loan_v1,
                                   columns=categorical_feature,
                                   dummy_na=True)
applicant_loan_v2.shape
# Sanity check: after encoding there should be no NaNs left anywhere.
applicant_loan_v2.isna().sum().to_frame('nans').sort_values(by='nans', ascending=False).sum()
Apply feature scaling to the data.
There is one point which we need to give thought to:
Does it make sense to apply the transformation to one-hot encoded features? I believe it does not: one-hot encoding implies the level of measurement for a feature is nominal/categorical, while standardization implies the level of measurement is at least interval.
For example, if the feature is country of origin, one-hot encoding makes sense because the feature is categorical — a person either is from a country or is not. Taking the mean of the country of origin yields numbers that do not make sense.
So I will only apply feature scaling to variables that are not categorical or binary features.
Reference: (the original link is missing — see scikit-learn's preprocessing guide on standardization.)
# Standardize only the numerical features; one-hot and binary columns keep
# their 0/1 scale (standardizing nominal indicators is not meaningful).
scaler = StandardScaler()
features_transform = numerical_features
# fit_transform learns per-column mean/std and replaces the columns in place.
applicant_loan_v2[features_transform] = scaler.fit_transform(applicant_loan_v2[features_transform])
applicant_loan_v2.head()
applicant_loan_v2.shape
# for categorical data
def counterplot(df, features, target=True):
    """Draw count plots for each feature on a 2-column grid of subplots.

    Parameters
    ----------
    df : DataFrame containing the columns named in `features`.
    features : sequence of column names, one count plot each.
    target : when True, split each plot by the 'high_risk_applicant' column.
    """
    # squeeze=False keeps `axes` 2-D even when there is a single subplot row,
    # so axes[row, col] indexing cannot crash for short feature lists.
    fig, axes = plt.subplots(math.ceil(len(features) / 2), 2,
                             figsize=(28, 70), squeeze=False)
    for idx, cat_col in enumerate(features):
        row, col = divmod(idx, 2)
        if target:
            ax = sns.countplot(x=cat_col, data=df, hue='high_risk_applicant',
                               ax=axes[row, col])
        else:
            ax = sns.countplot(x=cat_col, data=df, ax=axes[row, col])
        ax.set_xticklabels(ax.get_xticklabels(), rotation=10, ha="right", fontsize=15)
    plt.subplots_adjust(hspace=0.6)
# Low-cardinality numerical features get count plots; the three wide-range
# ones are plotted separately as box plots. Note np.setdiff1d returns a
# sorted array, so the plot order is alphabetical, not list order.
other_numerical_features = np.setdiff1d(numerical_features,['Primary_applicant_age_in_years','Months_loan_taken_for','Principal_loan_amount'])
features = list(categorical_feature) + list(other_numerical_features)
counterplot(applicant_loan_v1, features)
#For 'Primary_applicant_age_in_years','Months_loan_taken_for','Principal_loan_amount'
def boxplotter(df, xaxis=True,
               columns=('Primary_applicant_age_in_years',
                        'Months_loan_taken_for',
                        'Principal_loan_amount')):
    """Draw side-by-side box plots for the wide-range numerical features.

    Parameters
    ----------
    df : DataFrame containing the columns to plot.
    xaxis : when True, split each box plot by the 'high_risk_applicant'
        target; when False, plot the plain distribution.
    columns : columns to plot, one subplot each. Generalized from the
        previously hard-coded triple; the default preserves old behavior.
    """
    # squeeze=False keeps `axes` 2-D so indexing works for any column count.
    fig, axes = plt.subplots(1, len(columns), figsize=(40, 15), squeeze=False)
    for idx, cat_col in enumerate(columns):
        if xaxis:
            p = sns.boxplot(y=cat_col, data=df, x='high_risk_applicant', ax=axes[0, idx])
            p.set_xlabel("high_risk_applicant", fontsize = 20)
        else:
            p = sns.boxplot(y=cat_col, data=df, ax=axes[0, idx])
            p.set_xlabel("X-Axis", fontsize = 20)
        p.set_ylabel(cat_col, fontsize = 20)
        # Plain tick labels: large loan amounts would otherwise be shown in
        # scientific notation.
        p.ticklabel_format(style='plain', axis='y')
    plt.subplots_adjust(hspace=1)
# Box plots of the three wide-range numerical features, split by the target.
boxplotter(applicant_loan_v1)
# Echo the full plotted feature list (notebook display only; no side effects).
list(categorical_feature) + list(other_numerical_features)
def ratio_plotter(df, features, cluster=False):
    """Bar plots of per-category ratios for each feature in `features`.

    cluster=False: for each category value, plot the share of low-risk
        applicants (high_risk_applicant == 0) among all applicants holding
        that value, computed over the full dataset.
    cluster=True: `df` holds one cluster's rows; plot, for each category
        value, the share of the full dataset's rows that fall in the cluster.

    NOTE(review): both branches deliberately use the module-level
    applicant_loan_v1 as the reference population; `df` only supplies the
    numerator in the cluster branch.
    """
    # squeeze=False keeps `axes` 2-D even for a single subplot row.
    fig, axes = plt.subplots(math.ceil(len(features) / 2), 2,
                             figsize=(28, 70), squeeze=False)
    for idx, cols in enumerate(features):
        row, col = divmod(idx, 2)
        # Distinct category values, with NaN filtered out (NaN != NaN).
        values = [x for x in applicant_loan_v1[cols].unique() if x == x]
        if cluster:
            ratios = [df[df[cols] == value].shape[0]
                      / applicant_loan_v1[applicant_loan_v1[cols] == value].shape[0]
                      for value in values]
        else:
            # Compute each count once (the original evaluated the low-risk
            # count twice per category value).
            ratios = []
            for value in values:
                in_value = applicant_loan_v1[cols] == value
                low = applicant_loan_v1[in_value & (applicant_loan_v1['high_risk_applicant'] == 0)].shape[0]
                high = applicant_loan_v1[in_value & (applicant_loan_v1['high_risk_applicant'] == 1)].shape[0]
                ratios.append(low / (low + high))
        plot_df = pd.DataFrame({'ratio': ratios})
        plot_df[cols] = values
        ax = sns.barplot(x=plot_df[cols], y=plot_df['ratio'], ax=axes[row, col])
        ax.set_xticklabels(ax.get_xticklabels(), rotation=10, ha="right", fontsize=15)
    plt.subplots_adjust(hspace=1)
ratio_plotter(applicant_loan_v1,features)
First of all, let us understand what the ratio in the graphs above means: for a given category, it is the proportion of low-risk applicants to the total number of applicants in that category. So the higher the value for a category, the more reliable (less risky) that category of applicants is.
Now lets start with the interpretation for each of the plots above:
Marital_status:
Housing:
Employment status:
Here all the values are pretty similar, but judging from the minor differences, the results seem to go exactly opposite to our intuition: unskilled-resident applicants appear to be the lowest risk, while management / highly skilled officials appear to be the highest risk.
This may be due to the kind of loan they take: unskilled residents are likely to take small loans which will most likely be repaid, while highly skilled officials might take big loans which sometimes might not be repaid.
Savings_account_balance:
Purpose:
Property:
Loan_history:
Has_been_employed_for_at_least and Has_been_employed_for_at_most:
The last few features are binary. To conclude: applicants who are foreign workers, who have a co-applicant, who do not have a guarantor, and who are female are considered high risk.
And low EMI rate is considered as less risky, whereas number of dependents do not matter much. And applicants with 2 or 3 existing loans at the bank increases reliability and hence reduces risk.
I believe some of the features which are not behaving as expected are doing so because these features are correlated with other features and the anomalies are due to the trends and patterns that the other features follow.
For this section we do not have any need of the target variable i.e. high_risk_applicant. It is an unsupervised method similar to Principal Component Analysis that only looks at the relationship among the input variables.
So we will drop the target variable and perform PCA and clustering on the data.
# Unsupervised stage: PCA and k-means must not see the label, so remove the
# target column first.
applicant_loan_v3 = applicant_loan_v2.drop(columns=['high_risk_applicant'])
applicant_loan_v3.shape
# Fit PCA with all components initially, to inspect the full
# explained-variance profile before choosing how many to keep.
pca = PCA()
applicant_loan_pca = pca.fit_transform(applicant_loan_v3)
The scree plot is used to determine the number of factors to retain in an exploratory factor analysis (FA) or principal components to keep in a principal component analysis (PCA).
We will check the variance that is represented by the number of PCA components, and keep only as many components as are needed to represent 90% of the variance of the original data.
Finally, we keep 23 components, as they represent 92% of the variance of the data, which seems good enough.
It was also observed that the last 20 components had negligible impact on capturing the variability, this shows the capability of PCA to do dimensionality reduction without any loss of variability.
# Scree plot of explained variance per component (project helper).
scree_plot(pca)
# Cumulative explained variance, computed once with cumsum instead of
# re-summing a growing slice on every loop iteration (was O(n^2)).
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
for i in range(1, len(pca.explained_variance_ratio_)):
    print('{} components explains {}% of variance'.format(i, cumulative_variance[i - 1] * 100))
# Re-apply PCA to the data while selecting for number of components to retain
# (23 components cover ~92% of the variance).
pca = PCA(n_components=23)
applicant_loan_pca = pca.fit_transform(applicant_loan_v3)
Now comes the main part — interpreting the principal components. Please read it carefully; I will try to explain each component as clearly as I can.
def pca_results(full_dataset, pca, comp_number, feature_count):
    '''
    Plot and return the strongest feature weights of one PCA component.

    Parameters
    ----------
    full_dataset : DataFrame whose columns name the original features.
    pca : fitted PCA object whose components_ rows match those columns.
    comp_number : 1-based index of the component to inspect.
    feature_count : how many features to take from each end of the sorted
        weight list, so 2 * feature_count bars are plotted.

    Returns the selected weights as a Series indexed by feature name.
    '''
    # All component weight vectors, rounded and labelled by feature name.
    weights = pd.DataFrame(np.round(pca.components_, 4),
                           columns=full_dataset.keys())
    # Select the requested component and order its weights high-to-low.
    component = weights.iloc[comp_number - 1].sort_values(ascending=False)
    # The most informative features sit at both extremes of the ordering.
    component = pd.concat([component.head(feature_count),
                           component.tail(feature_count)])
    component.plot(kind='bar',
                   title='top {} weighted features for PCA component {}'.format(
                       feature_count * 2, comp_number),
                   figsize=(12, 6));
    plt.show()
    return component
# Map weights for the first principal component to corresponding feature names
# and then print the linked values, sorted by weight.
pca_results(applicant_loan_v3,pca,1,5)
pca_results(applicant_loan_v3,pca,2,5)
pca_results(applicant_loan_v3,pca,3,5)
# Component 11 is inspected as an additional, less dominant direction.
pca_results(applicant_loan_v3,pca,11,5)
We know that if two features have large weights of the same sign (both positive or both negative), then increases in one can be expected to be associated with increases in the other. In contrast, features with different signs can be expected to show a negative correlation: increases in one variable should result in a decrease in the other.
Positively weighted features in 1st component:
Principal_loan_amount 0.5279
Months_loan_taken_for 0.4786
Primary_applicant_age_in_years 0.3399
Years_at_current_residence 0.2901
Number_of_existing_loans_at_this_bank 0.2452
Negatively weighted features in 1st component:
Has_been_employed_for_at_most_4 years -0.0717
Property_real estate -0.0899
Gender_female -0.1184
Marital_status_divorced/separated/married -0.1184
Loan_history_existing loans paid back duly till now -0.1220
Positively weighted features in 2nd component:
Principal_loan_amount 0.4542
Months_loan_taken_for 0.4280
Loan_history_existing loans paid back duly till now 0.1028
Gender_female 0.0698
Marital_status_divorced/separated/married 0.0698
Negatively weighted features in 2nd component:
Number_of_dependents -0.2206
EMI_rate_in_percentage_of_disposable_income -0.2445
Years_at_current_residence -0.3003
Number_of_existing_loans_at_this_bank -0.3194
Primary_applicant_age_in_years -0.4352
Positively weighted features in 3rd component:
Number_of_dependents 0.5362
Number_of_existing_loans_at_this_bank 0.2662
Principal_loan_amount 0.0756
Employment_status_unskilled - resident 0.0711
Gender_male 0.0662
Negatively weighted features in 3rd component:
Has_been_employed_for_at_most_nan -0.0863
Has_been_employed_for_at_least_7 years -0.0863
Months_loan_taken_for -0.2427
Years_at_current_residence -0.3294
EMI_rate_in_percentage_of_disposable_income -0.6175
Positively weighted features in 11th component:
Principal_loan_amount 0.2884
Savings_account_balance_nan 0.2858
Employment_status_management / self-employed / highly qualified employee / officer 0.1961
EMI_rate_in_percentage_of_disposable_income 0.1910
Property_car or other 0.1784
Negatively weighted features in 11th component:
Has_been_employed_for_at_most_1 year -0.1748
Purpose_electronic equipment -0.1848
Property_real estate -0.3089
Months_loan_taken_for -0.4202
Savings_account_balance_Low -0.4570
%%time
def kmeans_score(data, no_clusters):
    """Return the k-means SSE (inertia) for a given cluster count.

    Fits k-means with a capped iteration budget and returns the
    within-cluster sum of squared distances, for use in the elbow plot.
    """
    km = KMeans(n_clusters=no_clusters, max_iter=25)
    model = km.fit(data)
    # inertia_ is the SSE already computed during fit; the previous
    # np.abs(model.score(data)) recomputed the same quantity (negated)
    # over the whole dataset.
    return model.inertia_
no_clusters = 20
scores = []
centers = []
# Evaluate cluster counts k = 2, 7, 12, 17: run k-means at each count and
# record the within-cluster SSE for the elbow plot below.
for cluster_count in range(2, no_clusters, 5):
    centers.append(cluster_count)
    sse = kmeans_score(applicant_loan_pca, cluster_count)
    print(sse)
    scores.append(sse)
#Investigate the change in within-cluster distance across number of clusters.
# elbow method to see how many clusters are best. For that we need to plot
# sum of squared euclidean distance which we get from .score function in kmeans vs no_clusters
# (Trailing semicolons suppress the textual repr of each call in a notebook.)
plt.plot(centers, scores, linestyle='--', marker='o', color='r');
plt.xlabel('no_Centers');
plt.ylabel('SSE');
plt.title('SSE vs. no_Centers');
# Re-fit the k-means model with the selected number of clusters and obtain
# cluster assignments for the applicant data.
k = 7
# Bug fix: the n_jobs parameter was removed from KMeans in scikit-learn 1.0,
# so passing it raises TypeError on modern versions; it is dropped here.
model = KMeans(n_clusters=k, random_state=1234, max_iter=25).fit(applicant_loan_pca)
# predict() on the training data is exactly the stored labels_.
applicant_loan_clustered = model.labels_
set(applicant_loan_clustered.tolist())
# Row positions belonging to each cluster, via one vectorized pass per cluster
# instead of a Python-level scan.
cluster_indexing = [np.where(applicant_loan_clustered == j)[0].tolist() for j in range(k)]
for j in range(k):
    print("Length of cluster {} : {}".format(j, len(cluster_indexing[j])))
As discussed in the cell above, we know what value an applicant should have in each feature in order to be more credible and hence get approved for a loan. And from this discussion regarding assigning weights to features, we also have an idea of which features are more important and which are less important.
And from the above knowledge, we can better interpret these graphs below and choose which customer segment/ cluster gets approved for a loan.
In the section below, I have plotted bar plots for categorical data and box plots for numerical data.
The bar plots are plotted for each feature where each bar represents each category in that feature. The bars do not just represent the count of the value in that category as any dominating category would be over represented in each cluster. So instead I decided to plot a ratio denoted as follows:
(Count of the category in that cluster)/ (total count of the category in the dataset)
# Per-cluster profiles: for each of the 7 clusters, plot the share of every
# category captured by the cluster (ratio bar plots) and the distributions of
# the three wide-range numerical features (box plots). Same call sequence as
# the previous copy-pasted version, expressed as one loop.
for cluster_no in range(7):
    members = applicant_loan_v1.loc[cluster_indexing[cluster_no], :]
    ratio_plotter(members, features, cluster = True)
    boxplotter(members, xaxis=False)
The comparison of a cluster's bar plots with the bar plots of the overall dataset can be interpreted as follows. I have done it for one cluster, 'cluster 0', but it can be done similarly for each of the 7 clusters.
So based on the above observation there are 10 supporting and 6 opposing evidences
By doing a similar comparison for all the clusters, we can say that clusters 1, 3, and 6 can be approved for a loan.